#encoding=utf-8
import json
import string
import csv

# Input JSONL file; the script parses each line as one JSON object.
in_path = '../data/gpt3cn_generations/finetuning_data/gpt3_finetuning_data_107.jsonl'
# Output CSV file; written with a header row and "question"/"answer" columns.
out_path = '../data/gpt3cn_generations/finetuning_data/gpt3_finetuning_data_107_chinese.csv'
# Shape of each input line: {"prompt":"xxxx",  "completion":"yyyyy"}


def clear_str(str):
    """Strip the 'Task:' / 'Output:' markers from a text field.

    Leading markers are removed in a fixed order ('Task:', then 'Output:',
    then ' Output:'), each one only if the remaining text starts with it.
    A trailing 'Output:' is removed next, trailing whitespace is stripped,
    and a single newline is appended.

    Args:
        str: The raw text to clean. The name shadows the builtin ``str``;
            it is kept so existing keyword callers keep working.

    Returns:
        The cleaned text, always terminated by exactly one newline.
    """
    # Order matters: removing an earlier marker can expose a later one.
    for marker in ('Task:', 'Output:', ' Output:'):
        if str.startswith(marker):
            str = str[len(marker):]

    trailing_marker = 'Output:'
    if str.endswith(trailing_marker):
        str = str[:-len(trailing_marker)]

    # Drop trailing whitespace, then end the text with one newline.
    return str.rstrip() + '\n'


# Convert every JSONL record into a {"question", "answer"} row, then write
# all rows to the CSV file.
alldata = []
with open(in_path, "r", encoding='utf-8') as fin:
    # Iterate the file lazily instead of loading every line up front.
    for line in fin:
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON and would make json.loads raise, so skip them.
        if not line.strip():
            continue
        data = json.loads(line)
        prompt = clear_str(data['prompt'])
        # NOTE(review): the last 13 characters of every completion are
        # dropped unconditionally -- presumably a fixed end-of-completion
        # marker; confirm against the input data.
        output = clear_str(data['completion'][:-13])
        alldata.append({"question": prompt, "answer": output})

# The input file is already closed here, so it is not held open while the
# output is written.
with open(out_path, 'w', newline='', encoding='utf-8') as file:
    # DictWriter maps each row dict onto the CSV columns by key.
    writer = csv.DictWriter(file, fieldnames=['question', 'answer'])
    # Header row first, then every converted record.
    writer.writeheader()
    writer.writerows(alldata)

